Import necessary packages and change display settings¶

In [ ]:
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook" # Let's us display Plotly figs after export to html file 

import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format # display numbers as with thousands separator
pd.set_option('display.max_colwidth', None) # display full text in columns

from IPython.display import display, HTML

# Custom CSS to align DataFrame values to the right
css_rules = """
<style>
    table.dataframe th, table.dataframe td {
        text-align: right !important;
    }
</style>
"""

# Apply the custom CSS
display(HTML(css_rules))

Import pre-cleaned data and find most viable "EARN" variables¶

In [ ]:
df = pd.read_csv('Data/clean_field_of_study.csv', dtype={'CIPFIELD': str, 'CIPCODE': str})

# Look at which "EARN" variables have the most datapoints
earn_col_counts = {}
for col in df.columns:
    if "EARN" in col:
        earn_col_counts[col] = len(df[col].unique()) -1 # subtract 1 to account for NaN

# sort dictionary by value to see which variables have the most data
earn_col_counts = {k: v for k, v in sorted(earn_col_counts.items(), key=lambda item: item[1], reverse=True)}
list(earn_col_counts.items())[:5] # Display first 5 items
Out[ ]:
[('EARN_MDN_HI_1YR', 264),
 ('EARN_MDN_1YR', 254),
 ('EARN_MDN_4YR', 244),
 ('EARN_PELL_WNE_MDN_1YR', 226),
 ('EARN_NE_MDN_3YR', 219)]

Filter data down to the key variables discovered above¶

  • EARN_MDN_1YR: 'Median earnings of graduates working and not enrolled 1 year after completing'
  • EARN_MDN_4YR: 'Median earnings of graduates working and not enrolled 4 years after completing'
  • EARN_COUNT_WNE_1YR: 'Number of graduates working and not enrolled 1 year after completing'
  • EARN_COUNT_WNE_4YR: 'Number of graduates working and not enrolled 4 years after completing'
In [ ]:
# We decide to us EARN_MDN_1YR and EARN_MDN_4YR
df = df[df.columns.to_list()[:7] + ['EARN_MDN_1YR', 'EARN_MDN_4YR', 'EARN_COUNT_WNE_1YR', 'EARN_COUNT_WNE_4YR']] 

# Get counts of nans
# Number of rows where all four EARN columns have NaN values
all_earn_nan = df[df[['EARN_MDN_1YR', 'EARN_MDN_4YR', 'EARN_COUNT_WNE_1YR', 'EARN_COUNT_WNE_4YR']].isna().all(axis=1)].shape[0]

# Number of rows where each individual EARN column has NaN values
earn_mdn_1yr_nan = df['EARN_MDN_1YR'].isna().sum()
earn_mdn_4yr_nan = df['EARN_MDN_4YR'].isna().sum()
earn_count_wne_1yr_nan = df['EARN_COUNT_WNE_1YR'].isna().sum()
earn_count_wne_4yr_nan = df['EARN_COUNT_WNE_4YR'].isna().sum()

print(f'All four EARN columns nan count: {all_earn_nan}\n1YR median earnings nan count: {earn_mdn_1yr_nan}\n4YR median earnings nan count: {earn_mdn_4yr_nan}')

# Drop rows with nan's (we lose 310 total rows -- 262 of which all four columns are nan)
df.dropna(inplace=True)
All four EARN columns nan count: 262
1YR median earnings nan count: 299
4YR median earnings nan count: 310

Explore earnings by University¶

In [ ]:
# Group earnings by universities (summary stats of all programs with public record)
univ_1yr_stats = df.groupby('INSTNM').agg({'EARN_COUNT_WNE_1YR': 'sum','EARN_MDN_1YR': ['mean', 'min', 'max']}).reset_index()
univ_1yr_stats = univ_1yr_stats.sort_values(by=('EARN_MDN_1YR', 'mean'), ascending=False).reset_index(drop=True)
univ_1yr_stats
Out[ ]:
INSTNM EARN_COUNT_WNE_1YR EARN_MDN_1YR
sum mean min max
0 University of Utah 3,280.00 47,227.86 25,918.00 85,652.00
1 Brigham Young University 3,652.00 47,174.15 17,223.00 86,469.00
2 Westminster College 155.00 46,325.40 31,263.00 59,998.00
3 Weber State University 2,198.00 46,260.09 17,779.00 77,117.00
4 Utah Valley University 3,134.00 44,682.09 22,648.00 82,864.00
5 Utah State University 3,427.00 39,905.87 21,265.00 67,878.00
6 Southern Utah University 525.00 35,300.25 19,031.00 58,924.00
In [ ]:
# Group earnings by universities (summary stats of all programs with public record)
univ_4yr_stats = df.groupby('INSTNM').agg({'EARN_COUNT_WNE_4YR': 'sum','EARN_MDN_4YR': ['mean', 'min', 'max']}).reset_index()
univ_4yr_stats = univ_4yr_stats.sort_values(by=('EARN_MDN_4YR', 'mean'), ascending=False).reset_index(drop=True)
univ_4yr_stats
Out[ ]:
INSTNM EARN_COUNT_WNE_4YR EARN_MDN_4YR
sum mean min max
0 Brigham Young University 3,543.00 63,373.25 23,498.00 127,592.00
1 University of Utah 1,560.00 61,987.43 37,398.00 99,562.00
2 Westminster College 196.00 60,817.40 50,739.00 67,444.00
3 Utah Valley University 2,730.00 57,600.16 16,331.00 103,513.00
4 Weber State University 2,142.00 55,969.97 32,470.00 89,974.00
5 Utah State University 2,921.00 52,193.71 25,367.00 91,862.00
6 Southern Utah University 475.00 44,503.17 31,561.00 62,156.00

Visualize University-Level Data¶

In [ ]:
fig = go.Figure()

univs = df['INSTNM'].unique().tolist() # Get list of unique universities

# Create Violin plot for each University
for univ in univs:
    fig.add_trace(go.Violin(x=df['INSTNM'][df['INSTNM'] == univ],
                            y=df['EARN_MDN_4YR'][df['INSTNM'] == univ],
                            name=univ,
                            box_visible=True,
                            meanline_visible=True,
                            showlegend=False))

# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
    title="Earnings 4 Years after Completion Grouped by University",
    yaxis_title="Earnings",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)

fig.show()
In [ ]:
fig = go.Figure()

univs = df['INSTNM'].unique().tolist() # Get list of unique universities

# Create Violin plot for each University
for univ in univs:
    fig.add_trace(go.Violin(x=df['INSTNM'][df['INSTNM'] == univ],
                            y=df['EARN_MDN_1YR'][df['INSTNM'] == univ],
                            name=univ,
                            box_visible=True,
                            meanline_visible=True,
                            showlegend=False))

# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
    title="Earnings 1 Years after Completion Grouped by University",
    yaxis_title="Earnings",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)

fig.show()

Explore earnings by general Field of Study¶

Field of study defined by first two digits of CIPCODE

In [ ]:
# Group earnings for each field of study (summary of all programs with public record)
field_1yr_stats = df.groupby('CIPDEF').agg({'EARN_COUNT_WNE_1YR': 'sum','EARN_MDN_1YR': ['mean', 'min', 'max']}).reset_index()
field_1yr_stats = field_1yr_stats.sort_values(by=('EARN_MDN_1YR', 'mean'), ascending=False).reset_index(drop=True)
field_1yr_stats
Out[ ]:
CIPDEF EARN_COUNT_WNE_1YR EARN_MDN_1YR
sum mean min max
0 ENGINEERING. 1,053.00 69,412.13 60,203.00 82,864.00
1 COMPUTER AND INFORMATION SCIENCES AND SUPPORT SERVICES. 1,047.00 69,317.67 37,101.00 86,469.00
2 CONSTRUCTION TRADES. 55.00 68,886.00 68,886.00 68,886.00
3 ENGINEERING/ENGINEERING-RELATED TECHNOLOGIES/TECHNICIANS. 79.00 65,609.00 62,141.00 69,077.00
4 MATHEMATICS AND STATISTICS. 94.00 57,342.50 56,462.00 58,223.00
5 BUSINESS, MANAGEMENT, MARKETING, AND RELATED SUPPORT SERVICES. 2,807.00 55,107.69 33,485.00 74,883.00
6 TRANSPORTATION AND MATERIALS MOVING. 212.00 50,560.50 44,128.00 56,993.00
7 PHYSICAL SCIENCES. 78.00 50,117.33 38,443.00 65,065.00
8 COMMUNICATIONS TECHNOLOGIES/TECHNICIANS AND SUPPORT SERVICES. 24.00 48,703.00 48,703.00 48,703.00
9 HEALTH PROFESSIONS AND RELATED PROGRAMS. 2,074.00 46,428.11 27,319.00 63,701.00
10 HOMELAND SECURITY, LAW ENFORCEMENT, FIREFIGHTING AND RELATED PROTECTIVE SERVICES. 305.00 44,299.50 36,865.00 56,578.00
11 ARCHITECTURE AND RELATED SERVICES. 60.00 44,067.50 39,464.00 48,671.00
12 EDUCATION. 1,295.00 43,397.00 33,949.00 57,508.00
13 SOCIAL SCIENCES. 953.00 40,257.64 26,905.00 56,599.00
14 FOREIGN LANGUAGES, LITERATURES, AND LINGUISTICS. 255.00 38,294.57 29,046.00 49,724.00
15 AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE AND RELATED FIELDS. 108.00 37,836.50 30,278.00 45,395.00
16 LIBERAL ARTS AND SCIENCES, GENERAL STUDIES AND HUMANITIES. 291.00 36,959.00 30,086.00 42,698.00
17 COMMUNICATION, JOURNALISM, AND RELATED PROGRAMS. 900.00 36,397.67 25,858.00 46,269.00
18 PUBLIC ADMINISTRATION AND SOCIAL SERVICE PROFESSIONS. 213.00 34,515.00 30,124.00 38,055.00
19 AREA, ETHNIC, CULTURAL, GENDER, AND GROUP STUDIES. 95.00 33,436.50 33,213.00 33,660.00
20 PARKS, RECREATION, LEISURE, FITNESS, AND KINESIOLOGY. 580.00 32,997.71 23,297.00 45,748.00
21 MULTI/INTERDISCIPLINARY STUDIES. 152.00 32,400.33 28,338.00 37,755.00
22 ENGLISH LANGUAGE AND LITERATURE/LETTERS. 423.00 32,389.29 28,928.00 41,726.00
23 HISTORY. 160.00 31,192.00 28,541.00 33,397.00
24 PSYCHOLOGY. 1,019.00 29,626.17 24,986.00 32,569.00
25 BIOLOGICAL AND BIOMEDICAL SCIENCES. 653.00 29,471.78 20,357.00 38,438.00
26 VISUAL AND PERFORMING ARTS. 765.00 28,146.20 17,223.00 39,217.00
27 FAMILY AND CONSUMER SCIENCES/HUMAN SCIENCES. 621.00 26,417.86 19,031.00 32,998.00
In [ ]:
# Group earnings for each field of study (summary of all programs with public record)
field_4yr_stats = df.groupby('CIPDEF').agg({'EARN_COUNT_WNE_4YR': 'sum','EARN_MDN_4YR': ['mean', 'min', 'max']}).reset_index()
field_4yr_stats = field_4yr_stats.sort_values(by=('EARN_MDN_4YR', 'mean'), ascending=False).reset_index(drop=True)
field_4yr_stats
Out[ ]:
CIPDEF EARN_COUNT_WNE_4YR EARN_MDN_4YR
sum mean min max
0 COMPUTER AND INFORMATION SCIENCES AND SUPPORT SERVICES. 826.00 88,258.00 48,538.00 114,046.00
1 ENGINEERING. 855.00 87,750.60 73,089.00 127,592.00
2 MATHEMATICS AND STATISTICS. 59.00 79,298.00 69,595.00 89,001.00
3 CONSTRUCTION TRADES. 37.00 78,807.00 78,807.00 78,807.00
4 ENGINEERING/ENGINEERING-RELATED TECHNOLOGIES/TECHNICIANS. 104.00 78,380.50 72,469.00 84,292.00
5 COMMUNICATIONS TECHNOLOGIES/TECHNICIANS AND SUPPORT SERVICES. 23.00 74,650.00 74,650.00 74,650.00
6 BUSINESS, MANAGEMENT, MARKETING, AND RELATED SUPPORT SERVICES. 2,427.00 72,721.94 50,174.00 97,349.00
7 TRANSPORTATION AND MATERIALS MOVING. 230.00 70,107.50 59,224.00 80,991.00
8 PHYSICAL SCIENCES. 59.00 69,189.33 61,624.00 76,268.00
9 ARCHITECTURE AND RELATED SERVICES. 45.00 62,267.00 61,739.00 62,795.00
10 SOCIAL SCIENCES. 729.00 56,863.71 30,608.00 81,529.00
11 HOMELAND SECURITY, LAW ENFORCEMENT, FIREFIGHTING AND RELATED PROTECTIVE SERVICES. 381.00 55,988.75 43,401.00 70,785.00
12 BIOLOGICAL AND BIOMEDICAL SCIENCES. 468.00 54,691.33 35,147.00 73,566.00
13 FOREIGN LANGUAGES, LITERATURES, AND LINGUISTICS. 291.00 53,570.57 33,074.00 66,679.00
14 HEALTH PROFESSIONS AND RELATED PROGRAMS. 1,681.00 52,063.72 23,498.00 68,640.00
15 AREA, ETHNIC, CULTURAL, GENDER, AND GROUP STUDIES. 87.00 50,107.00 44,503.00 55,711.00
16 COMMUNICATION, JOURNALISM, AND RELATED PROGRAMS. 664.00 49,233.00 42,911.00 58,056.00
17 AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE AND RELATED FIELDS. 64.00 47,654.50 38,971.00 56,338.00
18 MULTI/INTERDISCIPLINARY STUDIES. 196.00 46,442.00 39,623.00 56,942.00
19 PUBLIC ADMINISTRATION AND SOCIAL SERVICE PROFESSIONS. 244.00 45,043.00 39,435.00 47,214.00
20 EDUCATION. 1,189.00 44,786.85 36,162.00 49,647.00
21 LIBERAL ARTS AND SCIENCES, GENERAL STUDIES AND HUMANITIES. 256.00 44,328.25 32,839.00 51,593.00
22 HISTORY. 140.00 43,957.00 36,231.00 54,987.00
23 PARKS, RECREATION, LEISURE, FITNESS, AND KINESIOLOGY. 347.00 42,995.57 25,700.00 56,514.00
24 ENGLISH LANGUAGE AND LITERATURE/LETTERS. 419.00 42,187.71 38,592.00 46,995.00
25 PSYCHOLOGY. 693.00 39,981.00 31,561.00 44,664.00
26 VISUAL AND PERFORMING ARTS. 628.00 35,614.53 16,331.00 47,300.00
27 FAMILY AND CONSUMER SCIENCES/HUMAN SCIENCES. 425.00 34,970.86 24,823.00 44,984.00

Visualize Field of Study Data¶

In [ ]:
fig = go.Figure()

fields = df['CIPFIELD'].unique().tolist() # Get list of unique fields of study

# Create violin plot for each field of study
for field in fields:
    fig.add_trace(go.Violin(x=df['CIPFIELD'][df['CIPFIELD'] == field],
                            y=df['EARN_MDN_4YR'][df['CIPFIELD'] == field],
                            name=field,
                            box_visible=True,
                            meanline_visible=True,
                            showlegend=False))

# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
    title="Earnings 4 Years after Completion Grouped by Field of Study",
    yaxis_title="Earnings",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)

fig.show()
In [ ]:
fig = go.Figure()

fields = df['CIPFIELD'].unique().tolist() # Get list of unique fields of study

# Create violin plot for each field of study
for field in fields:
    fig.add_trace(go.Violin(x=df['CIPFIELD'][df['CIPFIELD'] == field],
                            y=df['EARN_MDN_1YR'][df['CIPFIELD'] == field],
                            name=field,
                            box_visible=True,
                            meanline_visible=True,
                            showlegend=False))

# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
    title="Earnings 1 Year after Completion Grouped by Field of Study",
    yaxis_title="Earnings",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)

fig.show()

Explore earnings by general Field of Study within each University¶

In [ ]:
# Group earnings for each field of study for each university
univ_field_1yr = df.groupby(['INSTNM', 'CIPDEF']).agg({'EARN_COUNT_WNE_1YR': 'sum','EARN_MDN_1YR': ['mean', 'min', 'max']}).reset_index()
univ_field_1yr.to_excel('Data/univ_field_1yr.xlsx')
In [ ]:
# Group earnings for each field of study for each university
univ_field_4yr = df.groupby(['INSTNM', 'CIPDEF']).agg({'EARN_COUNT_WNE_4YR': 'sum','EARN_MDN_4YR': ['mean', 'min', 'max']}).reset_index()
univ_field_4yr.to_excel('Data/univ_field_4yr.xlsx')